package com.shujia.flink.state;

import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.connector.kafka.source.KafkaSource;
import org.apache.flink.connector.kafka.source.enumerator.initializer.OffsetsInitializer;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

public class Demo05ConsumeKafkaExactlyOnce {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        env.enableCheckpointing(20 * 1000);

        /*
         * Flink作为Kafka的消费端如何保证ExactlyOnce语义
         * 通过开启CheckPoint来将消费的偏移量和对应的计算结果一起CK到HDFS中
         * 如果遇到故障可以进行恢复
         */
        KafkaSource<String> kafkaSource = KafkaSource.<String>builder()
                .setBootstrapServers("master:9092,node1:9092,node2:9092")
                .setTopics("words_exactly_once")
                .setGroupId("flink_grp_01")
                // 不管时earliest还是latest再次消费时，如果有已经提交过的消费位置，则会从上一次消费到的位置恢复消费
                .setStartingOffsets(OffsetsInitializer.earliest())
                .setValueOnlyDeserializer(new SimpleStringSchema())
                .build();

        DataStreamSource<String> kafkaDS = env.fromSource(kafkaSource, WatermarkStrategy.noWatermarks(), "kafkaSource");

        kafkaDS
                .flatMap((line, out) -> {
                    for (String word : line.split(",")) {
                        out.collect(word);
                    }
                }, Types.STRING)
                .map(word -> Tuple2.of(word, 1), Types.TUPLE(Types.STRING, Types.INT))
                .keyBy(kv -> kv.f0, Types.STRING)
                .sum(1)
                .print();

        env.execute();


    }
}
